The rivers of the world are home to numerous fish species whose existence depends on the temperature of the water. For salmonids specifically, see this article by Katharine Carter, an environmental scientist and lover of fish from sunny California. Salmonid varieties all thrive under different temperature ranges, and issues arise when river temperatures fall outside these ranges.
As we have been notified, it’s getting hot in here. Global warming is happening, and these fish are getting heatstroke. Because of these “fake” facts, protectors of the water have become interested in developing predictive models for maximum water temperature.
In this notebook, the dataset contains close to a full year of daily observed maximum air and maximum water temperatures for 31 different rivers in Spain. The variable D represents the Julian day and takes values from 1 to 365. The factor variable L identifies 31 different measurement station locations. Variables W and A are the maximum water and air temperatures, respectively. Finally, the variable named T for time maintains the order of the data according to when it was observed.
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import scipy.stats as stats
from IPython.display import display, HTML
# Import data: daily maximum air/water temperatures for 31 Spanish rivers.
# Columns (per the intro above): D = Julian day, L = station location id,
# W = max water temp, A = max air temp, T = observation order.
df = pd.read_csv('AirWaterTemp.csv')
# Function to create a scrollable table within a small window
def create_scrollable_table(df, table_id, title):
    """Return an HTML fragment: an <h3> *title* followed by *df* rendered
    inside a 200px-tall, scrollable <div> with id *table_id*."""
    fragments = [
        f'<h3>{title}</h3>',
        f'<div id="{table_id}" style="height:200px; overflow:auto;">',
        df.to_html(),
        '</div>',
    ]
    return ''.join(fragments)
# Dataset dimensions: (rows, columns)
df.shape
(11337, 5)
# Preview the first five observations
df.head()
| D | L | W | A | T | |
|---|---|---|---|---|---|
| 0 | 1 | 103 | 14.2 | 21.2 | 1 |
| 1 | 2 | 103 | 14.4 | 16.8 | 2 |
| 2 | 3 | 103 | 14.4 | 15.4 | 3 |
| 3 | 4 | 103 | 10.9 | 10.8 | 4 |
| 4 | 5 | 103 | 10.8 | 11.7 | 5 |
# Select the numeric columns (per the output below, all five: D, L, W, A, T)
numerical_features = df.select_dtypes(include=[np.number])
# Count/mean/std/quartiles per numeric column
numerical_features.describe()
| D | L | W | A | T | |
|---|---|---|---|---|---|
| count | 11337.00000 | 11337.000000 | 9857.000000 | 9857.000000 | 11337.00000 |
| mean | 183.35512 | 423.573256 | 16.221183 | 19.602871 | 183.35512 |
| std | 105.57604 | 315.171635 | 6.009886 | 7.898868 | 105.57604 |
| min | 1.00000 | 103.000000 | 1.800000 | -4.000000 | 1.00000 |
| 25% | 92.00000 | 126.000000 | 11.300000 | 13.500000 | 92.00000 |
| 50% | 183.00000 | 307.000000 | 15.700000 | 19.000000 | 183.00000 |
| 75% | 275.00000 | 810.000000 | 20.500000 | 25.000000 | 275.00000 |
| max | 366.00000 | 920.000000 | 37.200000 | 41.000000 | 366.00000 |
# Summary statistics for numerical features, rendered as a scrollable HTML table
numerical_features = df.select_dtypes(include=[np.number])
summary_stats = numerical_features.describe().T  # transpose: one row per feature
html_numerical = create_scrollable_table(summary_stats, 'numerical_features', 'Summary statistics for numerical features')
display(HTML(html_numerical))
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| D | 11337.0 | 183.355120 | 105.576040 | 1.0 | 92.0 | 183.0 | 275.0 | 366.0 |
| L | 11337.0 | 423.573256 | 315.171635 | 103.0 | 126.0 | 307.0 | 810.0 | 920.0 |
| W | 9857.0 | 16.221183 | 6.009886 | 1.8 | 11.3 | 15.7 | 20.5 | 37.2 |
| A | 9857.0 | 19.602871 | 7.898868 | -4.0 | 13.5 | 19.0 | 25.0 | 41.0 |
| T | 11337.0 | 183.355120 | 105.576040 | 1.0 | 92.0 | 183.0 | 275.0 | 366.0 |
# Null values in the dataset
null_values = df.isnull().sum()  # absolute count of missing entries per column
html_null_values = create_scrollable_table(null_values.to_frame(), 'null_values', 'Null values in the dataset')
# Percentage of missing values for each feature
missing_percentage = (df.isnull().sum() / len(df)) * 100
html_missing_percentage = create_scrollable_table(missing_percentage.to_frame(), 'missing_percentage', 'Percentage of missing values for each feature')
# Show both tables in a single output cell
display(HTML(html_null_values + html_missing_percentage))
| 0 | |
|---|---|
| D | 0 |
| L | 0 |
| W | 1480 |
| A | 1480 |
| T | 0 |
| 0 | |
|---|---|
| D | 0.0000 |
| L | 0.0000 |
| W | 13.0546 |
| A | 13.0546 |
| T | 0.0000 |
# Exploring rows with missing values.
# W and A appear to be missing together (both show 1480 nulls above),
# which suggests whole days without measurements rather than
# sensor-specific gaps — worth confirming per station.
rows_with_missing_values = df[df.isnull().any(axis=1)]
html_rows_with_missing_values = create_scrollable_table(rows_with_missing_values.head(), 'rows_with_missing_values', 'Rows with missing values')
display(HTML(html_rows_with_missing_values))
| D | L | W | A | T | |
|---|---|---|---|---|---|
| 97 | 98 | 103 | NaN | NaN | 98 |
| 104 | 105 | 103 | NaN | NaN | 105 |
| 134 | 135 | 103 | NaN | NaN | 135 |
| 168 | 169 | 103 | NaN | NaN | 169 |
| 215 | 216 | 103 | NaN | NaN | 216 |
# Column names: D=Julian day, L=location id, W=water temp, A=air temp, T=time order
df.columns
Index(['D', 'L', 'W', 'A', 'T'], dtype='object')
import scipy.stats as stats

# Impute missing water temperatures with the column mean.
# FIX: assign the result back instead of calling fillna(..., inplace=True) on
# the column — inplace fillna on a selected column is the deprecated
# chained-assignment pattern (FutureWarning in pandas >= 2.1, removed in 3.0)
# and may silently fail to modify df.
df['W'] = df['W'].fillna(df['W'].mean())

# Check for and remove any remaining non-finite values
# (defensive: mean-imputation should leave none)
df = df[np.isfinite(df['W'])]

# Fit a normal distribution to the water temp data
mu, sigma = stats.norm.fit(df['W'])

# Histogram of W, normalized to a probability density so the fitted pdf
# overlays on the same scale
hist_data = go.Histogram(x=df['W'], nbinsx=50, name="Histogram", opacity=0.75,
                         histnorm='probability density', marker=dict(color='purple'))

# Evaluate the fitted normal pdf over the observed temperature range
x_norm = np.linspace(df['W'].min(), df['W'].max(), 100)
y_norm = stats.norm.pdf(x_norm, mu, sigma)

# Create the normal distribution overlay
norm_data = go.Scatter(x=x_norm, y=y_norm, mode="lines",
                       name=f"Normal dist. (μ={mu:.2f}, σ={sigma:.2f})",
                       line=dict(color="green"))

# Combine the histogram and the overlay on a dark-themed layout
fig = go.Figure(data=[hist_data, norm_data])
fig.update_layout(
    title="Water Temp Distribution",
    xaxis_title="Water Temp",
    yaxis_title="Density",
    legend_title_text="Fitted Normal Distribution",
    plot_bgcolor='rgba(32, 32, 32, 1)',
    paper_bgcolor='rgba(32, 32, 32, 1)',
    font=dict(color='white')
)
fig.show()
Reg: Regular IR1: Slightly irregular IR2: Moderately irregular IR3: Irregular
# Calculate Correlation between water temp and air temp.
# NOTE: A still contains NaNs at this point (only W was imputed);
# Series.corr excludes NaN pairs pairwise, so those rows are dropped.
temp_corr = df['W'].corr(df['A'])
print(f'Correlation between water temp and air temp: {temp_corr}')
# Create a scatter plot to visualize the relationship,
# colored by T (observation order) to surface seasonal structure
fig9 = px.scatter(df, x='A', y='W', title='Water Temp vs Air Temp', color='T', color_continuous_scale=px.colors.sequential.Purp)
fig9.update_layout(plot_bgcolor='rgb(30,30,30)', paper_bgcolor='rgb(30,30,30)', font=dict(color='white'))
fig9.show()
Correlation between water temp and air temp: 0.8564445297954019
# Box plot of water temp for each Julian day, with the cross-station daily
# mean overlaid as a line and annotated per day.
daily_avg_water_temp = df.groupby('D')['W'].mean()  # mean W across all stations, per day
fig13 = px.box(df, x='D', y='W', title='Water Temp Trends Over the days',
               points=False, color_discrete_sequence=['green'])
# Borrow the single line trace from a throwaway px.line figure and add it here
fig13.add_trace(px.line(x=daily_avg_water_temp.index, y=daily_avg_water_temp.values).data[0])
# The selector restyles only the line trace, leaving the box traces untouched
fig13.update_traces(line=dict(color='purple', width=4), selector=dict(type='scatter', mode='lines'))
# One annotation per day (~365) showing the rounded daily mean
for day, avg_water_temp in daily_avg_water_temp.items():
    fig13.add_annotation(
        x=day,
        y=avg_water_temp,
        text=f"{avg_water_temp:,.0f}",
        font=dict(color='white'),
        showarrow=False,
        bgcolor='rgba(128, 0, 128, 0.6)'
    )
fig13.update_layout(
    plot_bgcolor='rgb(30,30,30)',
    paper_bgcolor='rgb(30,30,30)',
    font=dict(color='white'),
    xaxis_title='Day',
    yaxis_title='Water Temp'
)
fig13.show()
Why do this? - So we have consistent infrastructure for transforming the test set
Goal - To create infrastructure that lets us make changes without breaking everything
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Define transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # fill NaNs (e.g. in A) with the column mean
    ('scaler', StandardScaler())                  # then standardize to zero mean / unit variance
])
# NOTE(review): this transformer is defined but never wired into the
# ColumnTransformer below — presumably because the frame has no object-dtype
# columns; confirm before deleting.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # FIX: `sparse=False` was deprecated in scikit-learn 1.2 and removed in
    # 1.4; `sparse_output=False` is the supported spelling.
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
# Update numerical columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
# Remove target variable from numerical columns
numerical_columns = numerical_columns.drop('W')
# Combine transformers using ColumnTransformer.
# NOTE(review): L is a numeric station *id*; scaling it as a continuous
# feature is questionable — one-hot encoding may be more appropriate. TODO confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns)
    ],
    remainder='passthrough')
# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)])
# Apply the pipeline to your dataset
X = df.drop('W', axis=1)
y = np.log(df['W'])  # log-transform the target to reduce right skew
X_preprocessed = pipeline.fit_transform(X)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)
# Define the models (seeded where applicable so results are reproducible)
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}
# Define the hyperparameter grids for each model
# (LinearRegression has nothing to tune, hence the empty grid: 1 candidate)
param_grids = {
    'LinearRegression': {},
    'RandomForest': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 30],
        'min_samples_split': [2, 5, 10],
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 6, 10],
    }
}
# 3-fold cross-validation, shuffled with a fixed seed
cv = KFold(n_splits=3, shuffle=True, random_state=42)
# Train and tune the models. Scoring is negative MSE (sklearn maximizes
# scores), so RMSE is recovered below as sqrt(-best_score_). Note the
# target is log(W), so these RMSEs are in log-temperature units.
grids = {}
for model_name, model in models.items():
    #print(f'Training and tuning {model_name}...')
    grids[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
    grids[model_name].fit(X_train, y_train)
    best_params = grids[model_name].best_params_
    best_score = np.sqrt(-1 * grids[model_name].best_score_)
    print(f'Best parameters for {model_name}: {best_params}')
    print(f'Best RMSE for {model_name}: {best_score}\n')
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters for LinearRegression: {}
Best RMSE for LinearRegression: 0.20330111023689443
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for RandomForest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}
Best RMSE for RandomForest: 0.09588117543355605
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 500}
Best RMSE for XGBoost: 0.07381767995214228
from sklearn.neural_network import MLPRegressor

# X_preprocessed was already standardized by the pipeline, so these copies
# are effectively scaled; kept for naming consistency with the grid search below.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Create an MLPRegressor instance; training stops early after 3 iterations
# without improvement
mlp = MLPRegressor(random_state=42, max_iter=10000, n_iter_no_change=3, learning_rate_init=0.001)

# Define the parameter grid for tuning.
# FIX: `(25)` is just the int 25, not a 1-tuple — written `(25,)` so the
# intent (a single hidden layer of 25 units) is explicit; sklearn happens to
# coerce a bare int, but the grid should not rely on that.
param_grid = {
    'hidden_layer_sizes': [(10,), (10, 10), (10, 10, 10), (25,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}

# Create the GridSearchCV object (3-fold CV, negative MSE scoring)
grid_search_mlp = GridSearchCV(mlp, param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=1)

# Fit the model on the training data
grid_search_mlp.fit(X_train_scaled, y_train)

# Print the best parameters found during the search
print("Best parameters found: ", grid_search_mlp.best_params_)

# FIX: best_score_ is a cross-validation score, not a held-out test score —
# the label now says so (RMSE on the log-target).
best_score = np.sqrt(-1 * grid_search_mlp.best_score_)
print("Best CV RMSE: ", best_score)
Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters found: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 10, 10), 'learning_rate': 'constant', 'solver': 'adam'}
Test score: 0.16510638754315862
model_names = ['LinearRegression', 'RandomForest', 'XGBoost']
# Re-instantiate each model with the hyperparameters found by grid search
models_dict = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42, **grids['RandomForest'].best_params_),
    'XGBoost': XGBRegressor(random_state=42, **grids['XGBoost'].best_params_)
}

# The target was modeled as log(W); np.exp maps truth and predictions back
# to temperature units for the comparison.
comparison_df = pd.DataFrame({'True Water Temperature': np.exp(y_test)})

# Add columns for predicted temperatures by each model
for model_name in model_names:
    model = models_dict[model_name]
    model.fit(X_train, y_train)
    comparison_df[f'Predicted {model_name}'] = np.exp(model.predict(X_test))

# FIX: removed `comparison_df['True Water Temperature'] *= 10` — no x10
# scaling was ever applied to y (it is log(W), fully inverted by np.exp
# above), so the multiplication only distorted the x-axis of the plot.

# Create a scatter plot of actual vs predicted, with an OLS trendline per model
fig_comparison = px.scatter(comparison_df, x='True Water Temperature', y=[f'Predicted {model_name}' for model_name in model_names],
                            labels={'value': 'Predicted Water Temperature'},
                            title='Actual vs Predicted Water Temperatures for Different Models',
                            trendline='ols',  # Ordinary Least Squares regression line
                            )
fig_comparison.update_layout(
    plot_bgcolor='rgb(30,30,30)',
    paper_bgcolor='rgb(30,30,30)',
    font=dict(color='white')
)
# Show the plot
fig_comparison.show()